doctra 0.1.1__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- doctra/__init__.py +21 -18
- doctra/cli/main.py +5 -2
- doctra/cli/utils.py +12 -3
- doctra/engines/layout/paddle_layout.py +13 -78
- doctra/engines/vlm/provider.py +86 -58
- doctra/engines/vlm/service.py +10 -14
- doctra/exporters/html_writer.py +1235 -0
- doctra/parsers/structured_pdf_parser.py +35 -15
- doctra/parsers/table_chart_extractor.py +66 -28
- doctra/ui/__init__.py +5 -0
- doctra/ui/app.py +1012 -0
- doctra/utils/progress.py +428 -0
- doctra/utils/structured_utils.py +49 -49
- doctra/version.py +1 -1
- {doctra-0.1.1.dist-info → doctra-0.3.0.dist-info}/METADATA +45 -6
- {doctra-0.1.1.dist-info → doctra-0.3.0.dist-info}/RECORD +19 -15
- {doctra-0.1.1.dist-info → doctra-0.3.0.dist-info}/WHEEL +0 -0
- {doctra-0.1.1.dist-info → doctra-0.3.0.dist-info}/licenses/LICENSE +0 -0
- {doctra-0.1.1.dist-info → doctra-0.3.0.dist-info}/top_level.txt +0 -0
doctra/__init__.py
CHANGED
@@ -1,19 +1,22 @@
 """
 Doctra - Document Parsing Library
 Parse, extract, and analyze documents with ease
 """

 from .parsers.structured_pdf_parser import StructuredPDFParser
 from .parsers.table_chart_extractor import ChartTablePDFParser
 from .version import __version__
+from .ui import build_demo, launch_ui

-[old lines 10-18 not recoverable in this view]
+__all__ = [
+    'StructuredPDFParser',
+    'ChartTablePDFParser',
+    'build_demo',
+    'launch_ui',
+    '__version__'
+]
+
+# Package metadata
+__author__ = 'Adem Boukhris'
+__email__ = 'boukhrisadam98@gmail.com'  # Replace with your email
 __description__ = 'Parse, extract, and analyze documents with ease'
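Version 0.3.0 re-exports the new UI entry points at the package root. A minimal usage sketch, assuming doctra 0.3.0 is installed; calling `launch_ui()` with no arguments is an assumption here, since the function's signature is not shown in this diff:

```python
# Sketch of the 0.3.0 top-level API as exported by doctra/__init__.py.
from doctra import StructuredPDFParser, launch_ui

parser = StructuredPDFParser()          # full-document parsing pipeline
parser.parse("/abs/path/document.pdf")  # same parse(...) call used by the CLI below

launch_ui()  # assumed no-arg launcher for the new UI in doctra/ui/app.py
```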
doctra/cli/main.py
CHANGED
@@ -259,6 +259,7 @@ def parse(pdf_path: Path, output_dir: Optional[Path], use_vlm: bool,
         click.echo(f"📄 Processing: {pdf_path.name}")
         parser.parse(str(pdf_path.absolute()))
         click.echo("✅ Full document processing completed successfully!")
+        click.echo(f"📁 Output directory: {output_dir.absolute() if output_dir else 'outputs/'}")

     except KeyboardInterrupt:
         click.echo("\n⚠️ Processing interrupted by user", err=True)
@@ -444,6 +445,7 @@ def tables(pdf_path: Path, output_dir: Path, use_vlm: bool, vlm_provider: str,
         click.echo(f"📄 Processing: {pdf_path.name}")
         parser.parse(str(pdf_path), str(output_dir))
         click.echo("✅ Table extraction completed successfully!")
+        click.echo(f"📁 Output directory: {output_dir.absolute()}")

     except KeyboardInterrupt:
         click.echo("\n⚠️ Extraction interrupted by user", err=True)
@@ -522,6 +524,7 @@ def both(pdf_path: Path, output_dir: Path, use_vlm: bool, vlm_provider: str,
         click.echo(f"📄 Processing: {pdf_path.name}")
         parser.parse(str(pdf_path), str(output_dir))
         click.echo("✅ Chart and table extraction completed successfully!")
+        click.echo(f"📁 Output directory: {output_dir.absolute()}")

     except KeyboardInterrupt:
         click.echo("\n⚠️ Extraction interrupted by user", err=True)
@@ -818,8 +821,8 @@ def info():

     # VLM providers
     click.echo("\nVLM Providers:")
-    click.echo(" • Gemini (Google) - gemini-
-    click.echo(" • OpenAI - gpt-
+    click.echo(" • Gemini (Google) - gemini-2.5-pro, gemini-2.5-flash, gemini-2.5-flash-lite, gemini-2.0-flash")
+    click.echo(" • OpenAI - gpt-5, gpt-5-mini, gpt-4.1, gpt-4.1-mini, gpt-4o")

     # Available layout models
     click.echo("\nLayout Detection Models:")
doctra/cli/utils.py
CHANGED
@@ -263,7 +263,7 @@ def create_progress_callback(description: str, total: int):
     """
     Create a progress callback function for use with processing operations.

-    Creates a tqdm progress bar and returns a callback function that
+    Creates a beautiful tqdm progress bar and returns a callback function that
     can be used to update the progress during long-running operations.

    :param description: Description text for the progress bar
@@ -271,9 +271,18 @@ def create_progress_callback(description: str, total: int):
    :return: Callable progress callback function that takes an integer
        representing the number of completed items
    """
-    [old line not recoverable in this view]
+    import sys
+    from doctra.utils.progress import create_beautiful_progress_bar, create_notebook_friendly_bar

-    [old line not recoverable in this view]
+    # Enhanced environment detection
+    is_notebook = "ipykernel" in sys.modules or "jupyter" in sys.modules
+    is_terminal = hasattr(sys.stdout, 'isatty') and sys.stdout.isatty()
+
+    # Choose appropriate progress bar based on environment
+    if is_notebook:
+        pbar = create_notebook_friendly_bar(total=total, desc=description)
+    else:
+        pbar = create_beautiful_progress_bar(total=total, desc=description, leave=True)

     def callback(completed: int):
         pbar.n = completed
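The callback drives the bar by absolute count (`pbar.n = completed`) rather than incrementing, so repeated or out-of-order updates stay consistent. A standalone sketch of the same pattern, using plain tqdm as a stand-in for Doctra's `create_beautiful_progress_bar` and `create_notebook_friendly_bar` helpers:

```python
from tqdm.auto import tqdm  # tqdm.auto itself picks a notebook-friendly bar

def make_progress_callback(description: str, total: int):
    """Return a callback(completed) that drives a bar by absolute count."""
    pbar = tqdm(total=total, desc=description, leave=True)

    def callback(completed: int) -> None:
        pbar.n = completed  # jump to the absolute completed count
        pbar.refresh()      # redraw now rather than on the next implicit tick
        if completed >= total:
            pbar.close()

    return callback

# Hypothetical usage: a page loop reporting absolute progress.
report = make_progress_callback("Processing pages", total=5)
for done in range(1, 6):
    report(done)
```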
doctra/engines/layout/paddle_layout.py
CHANGED
@@ -4,16 +4,15 @@ import os
 import sys
 import json
 import tempfile
-import logging
 from dataclasses import dataclass, asdict
 from typing import Dict, List, Any, Tuple, Optional
-from tqdm import tqdm

 from PIL import Image
 from paddleocr import LayoutDetection  # pip install paddleocr>=2.7.0.3
 from doctra.utils.pdf_io import render_pdf_to_images
 from doctra.engines.layout.layout_models import LayoutBox, LayoutPage
-from doctra.utils.
+from doctra.utils.progress import create_loading_bar
+import warnings


 class PaddleLayoutEngine:
@@ -39,7 +38,7 @@ class PaddleLayoutEngine:
             (default: "PP-DocLayout_plus-L")
         """
         self.model_name = model_name
-        self.model: Optional[LayoutDetection] = None
+        self.model: Optional["LayoutDetection"] = None
@@ -53,80 +52,16 @@ class PaddleLayoutEngine:
         if self.model is not None:
             return

-        [old lines 56-65 truncated in this view: comments and the silent_init helper wrapping the saved tqdm originals]
-            original_tqdm_init(self, *args, **kwargs)
-
-        def silent_update(self, *args, **kwargs):
-            pass  # Do nothing
-
-        def silent_close(self, *args, **kwargs):
-            pass  # Do nothing
-
-        # More comprehensive output suppression
-        # Save original logging levels
-        original_levels = {}
-        loggers_to_silence = ['ppocr', 'paddle', 'PIL', 'urllib3', 'requests']
-        for logger_name in loggers_to_silence:
-            logger = logging.getLogger(logger_name)
-            original_levels[logger_name] = logger.level
-            logger.setLevel(logging.CRITICAL)
-
-        # Also try to silence the root logger temporarily
-        root_logger = logging.getLogger()
-        original_root_level = root_logger.level
-        root_logger.setLevel(logging.CRITICAL)
-
-        # Set environment variables that might help silence PaddlePaddle
-        old_env = {}
-        env_vars_to_set = {
-            'FLAGS_print_model_stats': '0',
-            'FLAGS_enable_parallel_graph': '0',
-            'GLOG_v': '4',  # Only show fatal errors
-            'GLOG_logtostderr': '0',
-            'GLOG_alsologtostderr': '0'
-        }
-
-        for key, value in env_vars_to_set.items():
-            old_env[key] = os.environ.get(key)
-            os.environ[key] = value
-
-        try:
-            # Monkey patch tqdm
-            tqdm.__init__ = silent_init
-            tqdm.update = silent_update
-            tqdm.close = silent_close
-
-            # Silence Paddle's download/init noise with enhanced suppression
-            with suppress_output():
-                self.model = LayoutDetection(model_name=self.model_name)
-
-        finally:
-            # Restore tqdm methods
-            tqdm.__init__ = original_tqdm_init
-            tqdm.update = original_tqdm_update
-            tqdm.close = original_tqdm_close
-
-            # Restore logging levels
-            for logger_name, level in original_levels.items():
-                logging.getLogger(logger_name).setLevel(level)
-            root_logger.setLevel(original_root_level)
-
-            # Restore environment variables
-            for key, old_value in old_env.items():
-                if old_value is None:
-                    os.environ.pop(key, None)
-                else:
-                    os.environ[key] = old_value
-
+        # Beautiful loading progress bar (no logging suppression)
+        with create_loading_bar(f'Loading PaddleOCR layout model: "{self.model_name}"') as bar:
+            # Suppress specific paddle extension warning: "No ccache found"
+            with warnings.catch_warnings():
+                warnings.filterwarnings(
+                    "ignore",
+                    message=r"No ccache found.*",
+                    category=UserWarning,
+                )
+                self.model = LayoutDetection(model_name=self.model_name)
             bar.update(1)

     def predict_pdf(
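The rewrite trades process-global patching (tqdm methods, logger levels, GLOG environment variables) for a scoped filter that silences one known-noisy warning. The pattern in isolation; `load_model` here is a hypothetical stand-in for the `LayoutDetection(model_name=...)` constructor:

```python
import warnings

def load_quietly(load_model):
    """Run load_model() with one specific UserWarning silenced."""
    with warnings.catch_warnings():
        # The filter applies only inside this block and is undone on exit,
        # unlike the removed code, which mutated tqdm, loggers, and os.environ.
        warnings.filterwarnings(
            "ignore",
            message=r"No ccache found.*",  # regex matched against the warning text
            category=UserWarning,
        )
        return load_model()
```

Because `warnings.catch_warnings()` restores the filter list automatically, none of the try/finally restoration bookkeeping the old block carried is needed.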
doctra/engines/vlm/provider.py
CHANGED
@@ -1,58 +1,86 @@
 from __future__ import annotations

 # --- keep these imports to match your snippet style ---
 import io
 import PIL
 import openai
 import outlines
 from pydantic import BaseModel
 from google.genai import Client
 from outlines.inputs import Image
-[old lines 11-58 not recoverable in this view: the previous make_model docstring and implementation]
+from anthropic import Anthropic
+# ------------------------------------------------------
+
+def make_model(
+    vlm_provider: str | None = "gemini",
+    vlm_model: str | None = None,
+    *,
+    api_key: str | None = None,
+):
+    """
+    Build a callable Outlines model for VLM processing.
+
+    Creates an Outlines model instance configured for Gemini, OpenAI, Anthropic, or OpenRouter
+    providers. Only one backend is active at a time, with Gemini as the default.
+
+    :param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", or "openrouter", default: "gemini")
+    :param vlm_model: Model name to use (defaults to provider-specific defaults)
+    :param api_key: API key for the VLM provider (required for all providers)
+    :return: Configured Outlines model instance
+    :raises ValueError: If provider is unsupported or API key is missing
+    """
+    vlm_provider = (vlm_provider or "gemini").lower()
+
+    # Set default models if not provided
+    if vlm_model is None:
+        if vlm_provider == "gemini":
+            vlm_model = "gemini-2.5-pro"
+        elif vlm_provider == "openai":
+            vlm_model = "gpt-5"
+        elif vlm_provider == "anthropic":
+            vlm_model = "claude-opus-4-1"
+        elif vlm_provider == "openrouter":
+            vlm_model = "x-ai/grok-4"
+
+    if vlm_provider == "gemini":
+        if not api_key:
+            raise ValueError("Gemini provider requires api_key to be passed to make_model(...).")
+        # Create the model (exactly like your snippet)
+        return outlines.from_gemini(
+            Client(api_key=api_key),
+            vlm_model,
+        )
+
+    if vlm_provider == "openai":
+        if not api_key:
+            raise ValueError("OpenAI provider requires api_key to be passed to make_model(...).")
+        # this part is for the openai models (exactly like your snippet)
+        return outlines.from_openai(
+            openai.OpenAI(api_key=api_key),
+            vlm_model,
+        )
+
+    if vlm_provider == "anthropic":
+        if not api_key:
+            raise ValueError("Anthropic provider requires api_key to be passed to make_model(...).")
+        # Create the Anthropic client and model (exactly like your snippet)
+        client = Anthropic(api_key=api_key)
+        return outlines.from_anthropic(
+            client,
+            vlm_model,
+        )
+
+    if vlm_provider == "openrouter":
+        if not api_key:
+            raise ValueError("OpenRouter provider requires api_key to be passed to make_model(...).")
+        # Create the OpenAI-compatible OpenRouter client and model (exactly like your snippet)
+        client = openai.OpenAI(
+            base_url="https://openrouter.ai/api/v1",
+            api_key=api_key,
+        )
+        return outlines.from_openai(
+            client,
+            vlm_model
+        )
+
+    raise ValueError(f"Unsupported provider: {vlm_provider}. Use 'gemini', 'openai', 'anthropic', or 'openrouter'.")
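Hypothetical calls against the new `make_model` (the key values are placeholders):

```python
from doctra.engines.vlm.provider import make_model

gemini = make_model(api_key="...")               # provider defaults to gemini-2.5-pro
claude = make_model("anthropic", api_key="...")  # defaults to claude-opus-4-1
grok = make_model("openrouter", api_key="...")   # x-ai/grok-4 via the OpenRouter base_url
```

Note that OpenRouter reuses `outlines.from_openai` pointed at a custom `base_url`, so no separate Outlines adapter is required.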
doctra/engines/vlm/service.py
CHANGED
@@ -15,9 +15,12 @@ class VLMStructuredExtractor:
     from images using Vision Language Models (VLM) with Outlines for type safety.

     Usage:
-        vlm = VLMStructuredExtractor(vlm_provider="gemini", api_key="YOUR_KEY"
+        vlm = VLMStructuredExtractor(vlm_provider="gemini", api_key="YOUR_KEY")
         chart = vlm.extract_chart("/abs/path/chart.jpg")
         table = vlm.extract_table("/abs/path/table.jpg")
+
+        # Or with Anthropic:
+        vlm = VLMStructuredExtractor(vlm_provider="anthropic", api_key="YOUR_KEY")
     """

     def __init__(
@@ -26,25 +29,21 @@ class VLMStructuredExtractor:
         vlm_model: str | None = None,
         *,
         api_key: str | None = None,
-        debug: bool = True,
     ):
         """
         Initialize the VLMStructuredExtractor with provider configuration.

-        Sets up the VLM model
-        from images.
+        Sets up the VLM model for structured data extraction from images.

-        :param vlm_provider: VLM provider to use ("gemini" or "
+        :param vlm_provider: VLM provider to use ("gemini", "openai", "anthropic", or "openrouter", default: "gemini")
         :param vlm_model: Model name to use (defaults to provider-specific defaults)
-        :param api_key: API key for the VLM provider (required for
-        :param debug: Whether to enable debug output for error handling (default: True)
+        :param api_key: API key for the VLM provider (required for all providers)
         """
         self.model = make_model(
             vlm_provider,
             vlm_model,
             api_key=api_key,
         )
-        self.debug = debug

     def _call(self, prompt_text: str, image_path: str, schema):
         """
@@ -68,13 +67,10 @@ class VLMStructuredExtractor:
                 img = img.convert("RGB")

             prompt = [prompt_text, Image(img)]
-            [old line not recoverable in this view]
+            result = self.model(prompt, schema)
+
+            return result
         except Exception as e:
-            if self.debug:
-                import traceback
-                print(f"[VLM ERROR] while processing: {image_path}")
-                traceback.print_exc()
-                print(f"[VLM ERROR] type={type(e).__name__} msg={e}")
             # Re-raise so caller can handle/log too
             raise
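With `debug` removed, `_call` now re-raises without printing, so error reporting belongs to the caller. A usage sketch mirroring the updated docstring; the key and image paths are placeholders:

```python
from doctra.engines.vlm.service import VLMStructuredExtractor

vlm = VLMStructuredExtractor(vlm_provider="anthropic", api_key="YOUR_KEY")
try:
    chart = vlm.extract_chart("/abs/path/chart.jpg")  # calls from the class docstring above
    table = vlm.extract_table("/abs/path/table.jpg")
except Exception as e:
    # 0.3.0 no longer prints tracebacks inside _call; handle or log here.
    print(f"VLM extraction failed: {type(e).__name__}: {e}")
```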